{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MAXP 2021初赛数据探索和处理-2\n",
    "\n",
    "处理Feature，并结合第1部分里得到的不匹配的节点，生成新的Feature，并保存。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path\n",
    "base_path = '/Users/jamezhan/PycharmProjects/MAXP/final_dataset'\n",
    "publish_path = 'publish'\n",
    "\n",
    "diff_node_path = os.path.join(base_path, publish_path, 'diff_nodes.csv')\n",
    "train_nodes_path = os.path.join(base_path, publish_path, 'train_nodes.csv')\n",
    "val_nodes_path = os.path.join(base_path, publish_path, 'validation_nodes.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 处理节点的特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_node(line):\n",
    "    nid, feat_json, label = line.strip().split('\\\"')\n",
    "    \n",
    "    feat_list = [float(feat[1:-1]) for feat in feat_json[1:-1].split(', ')]\n",
    "    \n",
    "    if len(feat_list) != 300:\n",
    "        print('此行数据有问题 {}'.format(line))\n",
    "    \n",
    "    return nid[:-1], feat_list, label[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 100000 train rows\n",
      "Processed 200000 train rows\n",
      "Processed 300000 train rows\n",
      "Processed 400000 train rows\n",
      "Processed 500000 train rows\n",
      "Processed 600000 train rows\n",
      "Processed 700000 train rows\n",
      "Processed 800000 train rows\n",
      "Processed 900000 train rows\n",
      "Processed 1000000 train rows\n",
      "Processed 1100000 train rows\n",
      "Processed 1200000 train rows\n",
      "Processed 1300000 train rows\n",
      "Processed 1400000 train rows\n",
      "Processed 1500000 train rows\n",
      "Processed 1600000 train rows\n",
      "Processed 1700000 train rows\n",
      "Processed 1800000 train rows\n",
      "Processed 1900000 train rows\n",
      "Processed 2000000 train rows\n",
      "Processed 2100000 train rows\n",
      "Processed 2200000 train rows\n",
      "Processed 2300000 train rows\n",
      "Processed 2400000 train rows\n",
      "Processed 2500000 train rows\n",
      "Processed 2600000 train rows\n",
      "Processed 2700000 train rows\n",
      "Processed 2800000 train rows\n",
      "Processed 2900000 train rows\n",
      "Processed 3000000 train rows\n",
      "Processed 100000 validation rows\n",
      "Processed 200000 validation rows\n",
      "Processed 300000 validation rows\n",
      "Processed 400000 validation rows\n",
      "Processed 500000 validation rows\n"
     ]
    }
   ],
   "source": [
    "# 下面处理特征Feature并存储亿备后用\n",
    "# nid_list = []\n",
    "feature_list = []\n",
    "\n",
    "with open(train_nodes_path, 'r') as f:\n",
    "    i = 0\n",
    "    \n",
    "    for line in f:\n",
    "        if i > 0:\n",
    "            _, features, _ = process_node(line)\n",
    "#             nid_list.append(nid)\n",
    "            feature_list.append(features)\n",
    "        i += 1\n",
    "        if i % 100000 == 0:\n",
    "            print('Processed {} train rows'.format(i))\n",
    "    \n",
    "with open(val_nodes_path, 'r') as f:\n",
    "    i = 0\n",
    "    \n",
    "    for line in f:\n",
    "        if i > 0:\n",
    "            _, features, _ = process_node(line)\n",
    "#             nid_list.append(nid)\n",
    "            feature_list.append(features)\n",
    "        i += 1\n",
    "        if i % 100000 == 0:\n",
    "            print('Processed {} validation rows'.format(i))\n",
    "            \n",
    "# nid_arr = np.array(nid_list)\n",
    "feat_arr = np.array(feature_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删除list以节省内存\n",
    "del feature_list\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 给未匹配上的419个节点造300维的特征，这里用其他所有节点的300维的平均值来作为他们的特征\n",
    "# 更好的方法是用每个节点的所有邻居的特征的平均，这里就不搞这么复杂了。\n",
    "diff_node_feat_arr = np.tile(np.mean(feat_arr, axis=0),(419, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3655452, 300)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feat_arr = np.concatenate((feat_arr, diff_node_feat_arr), axis=0)\n",
    "feat_arr.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 使用Numpy保存特征为.npy格式，以节省存储空间和提高读写速度\n",
    "with open(os.path.join(base_path, publish_path, './features.npy'), 'wb') as f:\n",
    "    np.save(f, feat_arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:dgl]",
   "language": "python",
   "name": "conda-env-dgl-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
